Analysing Data



In [114]:

    
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns



In [101]:

    
test=pd.read_csv("test.csv")
test.head() #visualising last 10 data
# print g_model.loc[417,"Survived"] #individual visualisation









    Out[101]:






  
    
      
      PassengerId
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      892
      3
      Kelly, Mr. James
      male
      34.5
      0
      0
      330911
      7.8292
      NaN
      Q
    
    
      1
      893
      3
      Wilkes, Mrs. James (Ellen Needs)
      female
      47.0
      1
      0
      363272
      7.0000
      NaN
      S
    
    
      2
      894
      2
      Myles, Mr. Thomas Francis
      male
      62.0
      0
      0
      240276
      9.6875
      NaN
      Q
    
    
      3
      895
      3
      Wirz, Mr. Albert
      male
      27.0
      0
      0
      315154
      8.6625
      NaN
      S
    
    
      4
      896
      3
      Hirvonen, Mrs. Alexander (Helga E Lindqvist)
      female
      22.0
      1
      1
      3101298
      12.2875
      NaN
      S



In [112]:

    
mData=pd.read_csv("train.csv")
mData.head()









    Out[112]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35
      0
      0
      373450
      8.0500
      NaN
      S



In [103]:

    
mData.info()
print "_________________________________________________"
test.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 73.1+ KB
_________________________________________________
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 31.0+ KB

Drop Non Required Coloumn



In [113]:

    
mData.drop(["PassengerId","Name","Ticket"], axis=1)
test.drop(["Name","Ticket"], axis=1)









    Out[113]:






  
    
      
      PassengerId
      Pclass
      Sex
      Age
      SibSp
      Parch
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      892
      3
      male
      34.5
      0
      0
      7.8292
      NaN
      Q
    
    
      1
      893
      3
      female
      47.0
      1
      0
      7.0000
      NaN
      S
    
    
      2
      894
      2
      male
      62.0
      0
      0
      9.6875
      NaN
      Q
    
    
      3
      895
      3
      male
      27.0
      0
      0
      8.6625
      NaN
      S
    
    
      4
      896
      3
      female
      22.0
      1
      1
      12.2875
      NaN
      S
    
    
      5
      897
      3
      male
      14.0
      0
      0
      9.2250
      NaN
      S
    
    
      6
      898
      3
      female
      30.0
      0
      0
      7.6292
      NaN
      Q
    
    
      7
      899
      2
      male
      26.0
      1
      1
      29.0000
      NaN
      S
    
    
      8
      900
      3
      female
      18.0
      0
      0
      7.2292
      NaN
      C
    
    
      9
      901
      3
      male
      21.0
      2
      0
      24.1500
      NaN
      S
    
    
      10
      902
      3
      male
      NaN
      0
      0
      7.8958
      NaN
      S
    
    
      11
      903
      1
      male
      46.0
      0
      0
      26.0000
      NaN
      S
    
    
      12
      904
      1
      female
      23.0
      1
      0
      82.2667
      B45
      S
    
    
      13
      905
      2
      male
      63.0
      1
      0
      26.0000
      NaN
      S
    
    
      14
      906
      1
      female
      47.0
      1
      0
      61.1750
      E31
      S
    
    
      15
      907
      2
      female
      24.0
      1
      0
      27.7208
      NaN
      C
    
    
      16
      908
      2
      male
      35.0
      0
      0
      12.3500
      NaN
      Q
    
    
      17
      909
      3
      male
      21.0
      0
      0
      7.2250
      NaN
      C
    
    
      18
      910
      3
      female
      27.0
      1
      0
      7.9250
      NaN
      S
    
    
      19
      911
      3
      female
      45.0
      0
      0
      7.2250
      NaN
      C
    
    
      20
      912
      1
      male
      55.0
      1
      0
      59.4000
      NaN
      C
    
    
      21
      913
      3
      male
      9.0
      0
      1
      3.1708
      NaN
      S
    
    
      22
      914
      1
      female
      NaN
      0
      0
      31.6833
      NaN
      S
    
    
      23
      915
      1
      male
      21.0
      0
      1
      61.3792
      NaN
      C
    
    
      24
      916
      1
      female
      48.0
      1
      3
      262.3750
      B57 B59 B63 B66
      C
    
    
      25
      917
      3
      male
      50.0
      1
      0
      14.5000
      NaN
      S
    
    
      26
      918
      1
      female
      22.0
      0
      1
      61.9792
      B36
      C
    
    
      27
      919
      3
      male
      22.5
      0
      0
      7.2250
      NaN
      C
    
    
      28
      920
      1
      male
      41.0
      0
      0
      30.5000
      A21
      S
    
    
      29
      921
      3
      male
      NaN
      2
      0
      21.6792
      NaN
      C
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      388
      1280
      3
      male
      21.0
      0
      0
      7.7500
      NaN
      Q
    
    
      389
      1281
      3
      male
      6.0
      3
      1
      21.0750
      NaN
      S
    
    
      390
      1282
      1
      male
      23.0
      0
      0
      93.5000
      B24
      S
    
    
      391
      1283
      1
      female
      51.0
      0
      1
      39.4000
      D28
      S
    
    
      392
      1284
      3
      male
      13.0
      0
      2
      20.2500
      NaN
      S
    
    
      393
      1285
      2
      male
      47.0
      0
      0
      10.5000
      NaN
      S
    
    
      394
      1286
      3
      male
      29.0
      3
      1
      22.0250
      NaN
      S
    
    
      395
      1287
      1
      female
      18.0
      1
      0
      60.0000
      C31
      S
    
    
      396
      1288
      3
      male
      24.0
      0
      0
      7.2500
      NaN
      Q
    
    
      397
      1289
      1
      female
      48.0
      1
      1
      79.2000
      B41
      C
    
    
      398
      1290
      3
      male
      22.0
      0
      0
      7.7750
      NaN
      S
    
    
      399
      1291
      3
      male
      31.0
      0
      0
      7.7333
      NaN
      Q
    
    
      400
      1292
      1
      female
      30.0
      0
      0
      164.8667
      C7
      S
    
    
      401
      1293
      2
      male
      38.0
      1
      0
      21.0000
      NaN
      S
    
    
      402
      1294
      1
      female
      22.0
      0
      1
      59.4000
      NaN
      C
    
    
      403
      1295
      1
      male
      17.0
      0
      0
      47.1000
      NaN
      S
    
    
      404
      1296
      1
      male
      43.0
      1
      0
      27.7208
      D40
      C
    
    
      405
      1297
      2
      male
      20.0
      0
      0
      13.8625
      D38
      C
    
    
      406
      1298
      2
      male
      23.0
      1
      0
      10.5000
      NaN
      S
    
    
      407
      1299
      1
      male
      50.0
      1
      1
      211.5000
      C80
      C
    
    
      408
      1300
      3
      female
      NaN
      0
      0
      7.7208
      NaN
      Q
    
    
      409
      1301
      3
      female
      3.0
      1
      1
      13.7750
      NaN
      S
    
    
      410
      1302
      3
      female
      NaN
      0
      0
      7.7500
      NaN
      Q
    
    
      411
      1303
      1
      female
      37.0
      1
      0
      90.0000
      C78
      Q
    
    
      412
      1304
      3
      female
      28.0
      0
      0
      7.7750
      NaN
      S
    
    
      413
      1305
      3
      male
      NaN
      0
      0
      8.0500
      NaN
      S
    
    
      414
      1306
      1
      female
      39.0
      0
      0
      108.9000
      C105
      C
    
    
      415
      1307
      3
      male
      38.5
      0
      0
      7.2500
      NaN
      S
    
    
      416
      1308
      3
      male
      NaN
      0
      0
      8.0500
      NaN
      S
    
    
      417
      1309
      3
      male
      NaN
      1
      1
      22.3583
      NaN
      C
    
  

418 rows × 9 columns

Plotting Some data



In [128]:

    
sns.factorplot('Embarked','Survived', data=mData,size=4,aspect=3)

#divide screen in 3
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))
sns.countplot(x='Embarked', data=mData, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=mData, order=[1,0], ax=axis2)

#Below Is Wow Feature
# group by embarked, and get the mean for survived passengers for each value in Embarked
embark_perc = mData[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3)









    Out[128]:





<matplotlib.axes.AxesSubplot at 0x91999d0>

#Apply some priproccesing

Fare



In [130]:

    
# only for test, since there is a missing "Fare" values
test["Fare"].fillna(test["Fare"].median(), inplace=True) #replace missing value by median

# convert from float to int
mData['Fare'] = mData['Fare'].astype(int)
test['Fare']    = test['Fare'].astype(int)

# get fare for survived & didn't survive passengers 
fare_not_survived = mData["Fare"][mData["Survived"] == 0]
fare_survived     = mData["Fare"][mData["Survived"] == 1]

# get average and std for fare of survived/not survived passengers
avgerage_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare      = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])



In [138]:

    
# plot
mData['Fare'].plot(kind='hist', figsize=(15,3),bins=100, xlim=(0,80))









    Out[138]:





<matplotlib.axes.AxesSubplot at 0xbff3e70>



In [143]:

    
avgerage_fare.index.names = std_fare.index.names = ["Survived"]
avgerage_fare



In [147]:

    
avgerage_fare.plot(yerr=std_fare,kind='bar',legend=False)









    Out[147]:





<matplotlib.axes.AxesSubplot at 0xc667470>

Age



In [150]:

    
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')


# get average, std, and number of NaN values in titanic_df
average_age_titanic   = mData["Age"].mean()
std_age_titanic       = mData["Age"].std()
count_nan_age_titanic = mData["Age"].isnull().sum()

# get average, std, and number of NaN values in test
average_age_test   = test["Age"].mean()
std_age_test       = test["Age"].std()
count_nan_age_test = test["Age"].isnull().sum()

# convert from float to int
mData['Age'] = mData['Age'].astype(int)
test['Age']    = test['Age'].astype(int)

# plot original Age values
mData['Age'].hist(bins=70, ax=axis1)
  
# generate random numbers between (mean - std) & (mean + std) ## WOW
rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)

# fill NaN values in Age column with random values generated
mData["Age"][np.isnan(mData["Age"])] = rand_1
test["Age"][np.isnan(test["Age"])] = rand_2


      
# plot new Age Values
mData['Age'].hist(bins=70, ax=axis2)









    



c:\python27\lib\site-packages\ipykernel\__main__.py:29: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
c:\python27\lib\site-packages\ipykernel\__main__.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    Out[150]:





<matplotlib.axes.AxesSubplot at 0xd292170>



In [151]:

    
# .... continue with plot Age column

# peaks for survived/not survived passengers by their age
facet = sns.FacetGrid(mData, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, mData['Age'].max()))
facet.add_legend()

# average survived passengers by age
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_age = mData[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_age)









    Out[151]:





<matplotlib.axes.AxesSubplot at 0xd6de190>

Family



In [152]:

    
# Family

# Instead of having two columns Parch & SibSp, 
# we can have only one column represent if the passenger had any family member aboard or not,
# Meaning, if having any family member(whether parent, brother, ...etc) will increase chances of Survival or not.
mData['Family'] =  mData['Parch'] + mData['SibSp']

mData['Family'].loc[mData['Family'] > 0] = 1
mData['Family'].loc[mData['Family'] == 0] = 0

test['Family'] =  test['Parch'] + test['SibSp']
test['Family'].loc[test['Family'] > 0] = 1
test['Family'].loc[test['Family'] == 0] = 0

# drop Parch & SibSp
mData = mData.drop(['SibSp','Parch'], axis=1)
test    = test.drop(['SibSp','Parch'], axis=1)

# plot
fig, (axis1,axis2) = plt.subplots(1,2,sharex=True,figsize=(10,5))

# sns.factorplot('Family',data=titanic_df,kind='count',ax=axis1)
sns.countplot(x='Family', data=mData, order=[1,0], ax=axis1)

# average of survived for those who had/didn't have any family member
family_perc = mData[["Family", "Survived"]].groupby(['Family'],as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1,0], ax=axis2)

axis1.set_xticklabels(["With Family","Alone"], rotation=0)









    



c:\python27\lib\site-packages\pandas\core\indexing.py:117: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)






    Out[152]:





[<matplotlib.text.Text at 0xda7d530>, <matplotlib.text.Text at 0xda91ab0>]

Sex



In [153]:

    
# Sex

# As we see, children(age < ~16) on aboard seem to have a high chances for Survival.
# So, we can classify passengers as males, females, and child
def get_person(passenger):
    age,sex = passenger
    return 'child' if age < 16 else sex
    
mData['Person'] = mData[['Age','Sex']].apply(get_person,axis=1)
test['Person']    = test[['Age','Sex']].apply(get_person,axis=1)

# No need to use Sex column since we created Person column
mData.drop(['Sex'],axis=1,inplace=True)
test.drop(['Sex'],axis=1,inplace=True)

# create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
person_dummies_titanic  = pd.get_dummies(mData['Person'])
person_dummies_titanic.columns = ['Male','Female','Child']
person_dummies_titanic.drop(['Male'], axis=1, inplace=True)

person_dummies_test  = pd.get_dummies(test['Person'])
person_dummies_test.columns = ['Male','Female','Child']
person_dummies_test.drop(['Male'], axis=1, inplace=True)

mData = mData.join(person_dummies_titanic)
test    = test.join(person_dummies_test)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,5))

# sns.factorplot('Person',data=titanic_df,kind='count',ax=axis1)
sns.countplot(x='Person', data=mData, ax=axis1)

# average of survived for each Person(male, female, or child)
family_perc = mData[["Person", "Survived"]].groupby(['Person'],as_index=False).mean()
sns.barplot(x='Person', y='Survived', data=family_perc, ax=axis2, order=['male','female','child'])

mData.drop(['Person'],axis=1,inplace=True)
test.drop(['Person'],axis=1,inplace=True)



In [ ]:

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35	0	373450	8.0500	NaN	S